import os
import warnings
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import CategoricalDtype
import plotly.express as px
from sklearn.feature_selection import mutual_info_regression
# Set Matplotlib defaults.
# The seaborn style sheets were renamed to "seaborn-v0_8-*" in Matplotlib 3.6
# and the old names were later removed, so try the new name first and fall
# back to the legacy one on older installations.
try:
    plt.style.use("seaborn-v0_8-whitegrid")
except OSError:
    plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
# Mute warnings (applies globally to everything executed after this point).
warnings.filterwarnings('ignore')
# Read the train and test CSVs (both indexed by PassengerId) from the data
# folder, then stack them so the preprocessing below sees every passenger.
data_dir = Path("Data and Submission")
df_train, df_test = (
    pd.read_csv(data_dir / csv_name, index_col="PassengerId")
    for csv_name in ("train.csv", "test.csv")
)
df = pd.concat([df_train, df_test])
df.head()
| Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| PassengerId | |||||||||||
| 1 | 0.0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 2 | 1.0 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 3 | 1.0 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 4 | 1.0 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 5 | 0.0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
df.info()
# Age, Cabin, Embarked and Fare contain some null values.
# Next step: read the data description to try to figure out the reason.
<class 'pandas.core.frame.DataFrame'> Int64Index: 1309 entries, 1 to 1309 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null float64 1 Pclass 1309 non-null int64 2 Name 1309 non-null object 3 Sex 1309 non-null object 4 Age 1046 non-null float64 5 SibSp 1309 non-null int64 6 Parch 1309 non-null int64 7 Ticket 1309 non-null object 8 Fare 1308 non-null float64 9 Cabin 295 non-null object 10 Embarked 1307 non-null object dtypes: float64(3), int64(3), object(5) memory usage: 122.7+ KB
df.Age.isnull().sum()
# Plan: impute the missing Age values by filling from neighbouring rows
# (backward/forward fill).
263
df.Cabin.isnull().sum()
# Plan: drop this column, as it does not seem relevant.
1014
print (df.Embarked.unique())
print(df.Embarked.isnull().sum())
# Plan: impute Embarked, but keep an eye on it and possibly drop it later,
# as it does not seem relevant.
['S' 'C' 'Q' nan] 2
# Print the distinct values of every column to eyeball typos and odd entries.
cols = df.columns
for col in cols:
    print(col, df[col].unique())
Survived [ 0. 1. nan] Pclass [3 1 2] Name ['Braund, Mr. Owen Harris' 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)' 'Heikkinen, Miss. Laina' ... 'Saether, Mr. Simon Sivertsen' 'Ware, Mr. Frederick' 'Peter, Master. Michael J'] Sex ['male' 'female'] Age [22. 38. 26. 35. nan 54. 2. 27. 14. 4. 58. 20. 39. 55. 31. 34. 15. 28. 8. 19. 40. 66. 42. 21. 18. 3. 7. 49. 29. 65. 28.5 5. 11. 45. 17. 32. 16. 25. 0.83 30. 33. 23. 24. 46. 59. 71. 37. 47. 14.5 70.5 32.5 12. 9. 36.5 51. 55.5 40.5 44. 1. 61. 56. 50. 36. 45.5 20.5 62. 41. 52. 63. 23.5 0.92 43. 60. 10. 64. 13. 48. 0.75 53. 57. 80. 70. 24.5 6. 0.67 30.5 0.42 34.5 74. 22.5 18.5 67. 76. 26.5 60.5 11.5 0.33 0.17 38.5 ] SibSp [1 0 3 4 2 5 8] Parch [0 1 2 5 3 4 6 9] Ticket ['A/5 21171' 'PC 17599' 'STON/O2. 3101282' '113803' '373450' '330877' '17463' '349909' '347742' '237736' 'PP 9549' '113783' 'A/5. 2151' '347082' '350406' '248706' '382652' '244373' '345763' '2649' '239865' '248698' '330923' '113788' '347077' '2631' '19950' '330959' '349216' 'PC 17601' 'PC 17569' '335677' 'C.A. 24579' 'PC 17604' '113789' '2677' 'A./5. 2152' '345764' '2651' '7546' '11668' '349253' 'SC/Paris 2123' '330958' 'S.C./A.4. 23567' '370371' '14311' '2662' '349237' '3101295' 'A/4. 39886' 'PC 17572' '2926' '113509' '19947' 'C.A. 31026' '2697' 'C.A. 34651' 'CA 2144' '2669' '113572' '36973' '347088' 'PC 17605' '2661' 'C.A. 29395' 'S.P. 3464' '3101281' '315151' 'C.A. 33111' 'S.O.C. 14879' '2680' '1601' '348123' '349208' '374746' '248738' '364516' '345767' '345779' '330932' '113059' 'SO/C 14885' '3101278' 'W./C. 6608' 'SOTON/OQ 392086' '343275' '343276' '347466' 'W.E.P. 5734' 'C.A. 2315' '364500' '374910' 'PC 17754' 'PC 17759' '231919' '244367' '349245' '349215' '35281' '7540' '3101276' '349207' '343120' '312991' '349249' '371110' '110465' '2665' '324669' '4136' '2627' 'STON/O 2. 3101294' '370369' 'PC 17558' 'A4. 54510' '27267' '370372' 'C 17369' '2668' '347061' '349241' 'SOTON/O.Q. 3101307' 'A/5. 3337' '228414' 'C.A. 
29178' 'SC/PARIS 2133' '11752' '7534' 'PC 17593' '2678' '347081' 'STON/O2. 3101279' '365222' '231945' 'C.A. 33112' '350043' '230080' '244310' 'S.O.P. 1166' '113776' 'A.5. 11206' 'A/5. 851' 'Fa 265302' 'PC 17597' '35851' 'SOTON/OQ 392090' '315037' 'CA. 2343' '371362' 'C.A. 33595' '347068' '315093' '363291' '113505' 'PC 17318' '111240' 'STON/O 2. 3101280' '17764' '350404' '4133' 'PC 17595' '250653' 'LINE' 'SC/PARIS 2131' '230136' '315153' '113767' '370365' '111428' '364849' '349247' '234604' '28424' '350046' 'PC 17610' '368703' '4579' '370370' '248747' '345770' '3101264' '2628' 'A/5 3540' '347054' '2699' '367231' '112277' 'SOTON/O.Q. 3101311' 'F.C.C. 13528' 'A/5 21174' '250646' '367229' '35273' 'STON/O2. 3101283' '243847' '11813' 'W/C 14208' 'SOTON/OQ 392089' '220367' '21440' '349234' '19943' 'PP 4348' 'SW/PP 751' 'A/5 21173' '236171' '347067' '237442' 'C.A. 29566' 'W./C. 6609' '26707' 'C.A. 31921' '28665' 'SCO/W 1585' '367230' 'W./C. 14263' 'STON/O 2. 3101275' '2694' '19928' '347071' '250649' '11751' '244252' '362316' '113514' 'A/5. 3336' '370129' '2650' 'PC 17585' '110152' 'PC 17755' '230433' '384461' '110413' '112059' '382649' 'C.A. 17248' '347083' 'PC 17582' 'PC 17760' '113798' '250644' 'PC 17596' '370375' '13502' '347073' '239853' 'C.A. 2673' '336439' '347464' '345778' 'A/5. 10482' '113056' '349239' '345774' '349206' '237798' '370373' '19877' '11967' 'SC/Paris 2163' '349236' '349233' 'PC 17612' '2693' '113781' '19988' '9234' '367226' '226593' 'A/5 2466' '17421' 'PC 17758' 'P/PP 3381' 'PC 17485' '11767' 'PC 17608' '250651' '349243' 'F.C.C. 13529' '347470' '29011' '36928' '16966' 'A/5 21172' '349219' '234818' '345364' '28551' '111361' '113043' 'PC 17611' '349225' '7598' '113784' '248740' '244361' '229236' '248733' '31418' '386525' 'C.A. 37671' '315088' '7267' '113510' '2695' '2647' '345783' '237671' '330931' '330980' 'SC/PARIS 2167' '2691' 'SOTON/O.Q. 
3101310' 'C 7076' '110813' '2626' '14313' 'PC 17477' '11765' '3101267' '323951' 'C 7077' '113503' '2648' '347069' 'PC 17757' '2653' 'STON/O 2. 3101293' '349227' '27849' '367655' 'SC 1748' '113760' '350034' '3101277' '350052' '350407' '28403' '244278' '240929' 'STON/O 2. 3101289' '341826' '4137' '315096' '28664' '347064' '29106' '312992' '349222' '394140' 'STON/O 2. 3101269' '343095' '28220' '250652' '28228' '345773' '349254' 'A/5. 13032' '315082' '347080' 'A/4. 34244' '2003' '250655' '364851' 'SOTON/O.Q. 392078' '110564' '376564' 'SC/AH 3085' 'STON/O 2. 3101274' '13507' 'C.A. 18723' '345769' '347076' '230434' '65306' '33638' '113794' '2666' '113786' '65303' '113051' '17453' 'A/5 2817' '349240' '13509' '17464' 'F.C.C. 13531' '371060' '19952' '364506' '111320' '234360' 'A/S 2816' 'SOTON/O.Q. 3101306' '113792' '36209' '323592' '315089' 'SC/AH Basle 541' '7553' '31027' '3460' '350060' '3101298' '239854' 'A/5 3594' '4134' '11771' 'A.5. 18509' '65304' 'SOTON/OQ 3101317' '113787' 'PC 17609' 'A/4 45380' '36947' 'C.A. 6212' '350035' '315086' '364846' '330909' '4135' '26360' '111427' 'C 4001' '382651' 'SOTON/OQ 3101316' 'PC 17473' 'PC 17603' '349209' '36967' 'C.A. 34260' '226875' '349242' '12749' '349252' '2624' '2700' '367232' 'W./C. 14258' 'PC 17483' '3101296' '29104' '2641' '2690' '315084' '113050' 'PC 17761' '364498' '13568' 'WE/P 5735' '2908' '693' 'SC/PARIS 2146' '244358' '330979' '2620' '347085' '113807' '11755' '345572' '372622' '349251' '218629' 'SOTON/OQ 392082' 'SOTON/O.Q. 392087' 'A/4 48871' '349205' '2686' '350417' 'S.W./PP 752' '11769' 'PC 17474' '14312' 'A/4. 20589' '358585' '243880' '2689' 'STON/O 2. 3101286' '237789' '13049' '3411' '237565' '13567' '14973' 'A./5. 3235' 'STON/O 2. 3101273' 'A/5 3902' '364848' 'SC/AH 29037' '248727' '2664' '349214' '113796' '364511' '111426' '349910' '349246' '113804' 'SOTON/O.Q. 
3101305' '370377' '364512' '220845' '31028' '2659' '11753' '350029' '54636' '36963' '219533' '349224' '334912' '27042' '347743' '13214' '112052' '237668' 'STON/O 2. 3101292' '350050' '349231' '13213' 'S.O./P.P. 751' 'CA. 2314' '349221' '8475' '330919' '365226' '349223' '29751' '2623' '5727' '349210' 'STON/O 2. 3101285' '234686' '312993' 'A/5 3536' '19996' '29750' 'F.C. 12750' 'C.A. 24580' '244270' '239856' '349912' '342826' '4138' '330935' '6563' '349228' '350036' '24160' '17474' '349256' '2672' '113800' '248731' '363592' '35852' '348121' 'PC 17475' '36864' '350025' '223596' 'PC 17476' 'PC 17482' '113028' '7545' '250647' '348124' '34218' '36568' '347062' '350048' '12233' '250643' '113806' '315094' '36866' '236853' 'STON/O2. 3101271' '239855' '28425' '233639' '349201' '349218' '16988' '376566' 'STON/O 2. 3101288' '250648' '113773' '335097' '29103' '392096' '345780' '349204' '350042' '29108' '363294' 'SOTON/O2 3101272' '2663' '347074' '112379' '364850' '8471' '345781' '350047' 'S.O./P.P. 3' '2674' '29105' '347078' '383121' '36865' '2687' '113501' 'W./C. 6607' 'SOTON/O.Q. 3101312' '374887' '3101265' '12460' 'PC 17600' '349203' '28213' '17465' '349244' '2685' '2625' '347089' '347063' '112050' '347087' '248723' '3474' '28206' '364499' '112058' 'STON/O2. 3101290' 'S.C./PARIS 2079' 'C 7075' '315098' '19972' '368323' '367228' '2671' '347468' '2223' 'PC 17756' '315097' '392092' '11774' 'SOTON/O2 3101287' '2683' '315090' 'C.A. 5547' '349213' '347060' 'PC 17592' '392091' '113055' '2629' '350026' '28134' '17466' '233866' '236852' 'SC/PARIS 2149' 'PC 17590' '345777' '349248' '695' '345765' '2667' '349212' '349217' '349257' '7552' 'C.A./SOTON 34068' 'SOTON/OQ 392076' '211536' '112053' '111369' '370376' '330911' '363272' '240276' '315154' '7538' '330972' '2657' '349220' '694' '21228' '24065' '233734' '2692' 'STON/O2. 3101270' '2696' 'C 17368' 'PC 17598' '2698' '113054' 'C.A. 31029' '13236' '2682' '342712' '315087' '345768' '113778' 'SOTON/O.Q. 3101263' '237249' 'STON/O 2. 
3101291' 'PC 17594' '370374' '13695' 'SC/PARIS 2168' 'SC/A.3 2861' '349230' '348122' '349232' '237216' '347090' '334914' 'F.C.C. 13534' '330963' '2543' '382653' '349211' '3101297' 'PC 17562' '359306' '11770' '248744' '368702' '19924' '349238' '240261' '2660' '330844' 'A/4 31416' '364856' '347072' '345498' '376563' '13905' '350033' 'STON/O 2. 3101268' '347471' 'A./5. 3338' '11778' '365235' '347070' '330920' '383162' '3410' '248734' '237734' '330968' 'PC 17531' '329944' '2681' '13050' '367227' '392095' '368783' '350045' '211535' '342441' 'STON/OQ. 369943' '113780' '2621' '349226' '350409' '2656' '248659' 'SOTON/OQ 392083' '17475' 'SC/A4 23568' '113791' '349255' '3701' '350405' 'S.O./P.P. 752' '347469' '110489' 'SOTON/O.Q. 3101315' '335432' '220844' '343271' '237393' 'PC 17591' '17770' '7548' 'S.O./P.P. 251' '2670' '2673' '233478' '7935' '239059' 'S.O./P.P. 2' 'A/4 48873' '28221' '111163' '235509' '347465' '347066' 'C.A. 31030' '65305' 'C.A. 34050' 'F.C. 12998' '9232' '28034' 'PC 17613' '349250' 'SOTON/O.Q. 3101308' '347091' '113038' '330924' '32302' 'SC/PARIS 2148' '342684' 'W./C. 14266' '350053' 'PC 17606' '350054' '370368' '242963' '113795' '3101266' '330971' '350416' '2679' '250650' '112377' '3470' 'SOTON/O2 3101284' '13508' '7266' '345775' 'C.A. 42795' 'AQ/4 3130' '363611' '28404' '345501' '350410' 'C.A. 34644' '349235' '112051' 'C.A. 49867' 'A. 2. 39186' '315095' '368573' '2676' 'SC 14888' 'CA 31352' 'W./C. 14260' '315085' '364859' 'A/5 21175' 'SOTON/O.Q. 3101314' '2655' 'A/5 1478' 'PC 17607' '382650' '2652' '345771' '349202' '113801' '347467' '347079' '237735' '315092' '383123' '112901' '315091' '2658' 'LP 1588' '368364' 'AQ/3. 30631' '28004' '350408' '347075' '2654' '244368' '113790' 'SOTON/O.Q. 3101309' '236854' 'PC 17580' '2684' '349229' '110469' '244360' '2675' '2622' 'C.A. 15185' '350403' '348125' '237670' '2688' '248726' 'F.C.C. 
13540' '113044' '1222' '368402' '315083' '112378' 'SC/PARIS 2147' '28133' '248746' '315152' '29107' '680' '366713' '330910' 'SC/PARIS 2159' '349911' '244346' '364858' 'C.A. 30769' '371109' '347065' '21332' '17765' 'SC/PARIS 2166' '28666' '334915' '365237' '347086' 'A.5. 3236' 'SOTON/O.Q. 3101262' '359309'] Fare [ 7.25 71.2833 7.925 53.1 8.05 8.4583 51.8625 21.075 11.1333 30.0708 16.7 26.55 31.275 7.8542 16. 29.125 13. 18. 7.225 26. 8.0292 35.5 31.3875 263. 7.8792 7.8958 27.7208 146.5208 7.75 10.5 82.1708 52. 7.2292 11.2417 9.475 21. 41.5792 15.5 21.6792 17.8 39.6875 7.8 76.7292 61.9792 27.75 46.9 80. 83.475 27.9 15.2458 8.1583 8.6625 73.5 14.4542 56.4958 7.65 29. 12.475 9. 9.5 7.7875 47.1 15.85 34.375 61.175 20.575 34.6542 63.3583 23. 77.2875 8.6542 7.775 24.15 9.825 14.4583 247.5208 7.1417 22.3583 6.975 7.05 14.5 15.0458 26.2833 9.2167 79.2 6.75 11.5 36.75 7.7958 12.525 66.6 7.3125 61.3792 7.7333 69.55 16.1 15.75 20.525 55. 25.925 33.5 30.6958 25.4667 28.7125 0. 15.05 39. 22.025 50. 8.4042 6.4958 10.4625 18.7875 31. 113.275 27. 76.2917 90. 9.35 13.5 7.55 26.25 12.275 7.125 52.5542 20.2125 86.5 512.3292 79.65 153.4625 135.6333 19.5 29.7 77.9583 20.25 78.85 91.0792 12.875 8.85 151.55 30.5 23.25 12.35 110.8833 108.9 24. 56.9292 83.1583 262.375 14. 164.8667 134.5 6.2375 57.9792 28.5 133.65 15.9 9.225 35. 75.25 69.3 55.4417 211.5 4.0125 227.525 15.7417 7.7292 12. 120. 12.65 18.75 6.8583 32.5 7.875 14.4 55.9 8.1125 81.8583 19.2583 19.9667 89.1042 38.5 7.725 13.7917 9.8375 7.0458 7.5208 12.2875 9.5875 49.5042 78.2667 15.1 7.6292 22.525 26.2875 59.4 7.4958 34.0208 93.5 221.7792 106.425 49.5 71. 13.8625 7.8292 39.6 17.4 51.4792 26.3875 30. 40.125 8.7125 15. 33. 42.4 15.55 65. 32.3208 7.0542 8.4333 25.5875 9.8417 8.1375 10.1708 211.3375 57. 13.4167 7.7417 9.4833 7.7375 8.3625 23.45 25.9292 8.6833 8.5167 7.8875 37.0042 6.45 6.95 8.3 6.4375 39.4 14.1083 13.8583 50.4958 5. 9.8458 10.5167 7. 9.6875 82.2667 3.1708 31.6833 31.5 57.75 7.85 60. 
15.0333 15.5792 28.5375 25.7 10.7083 13.9 7.8208 7.7792 31.6792 7.2833 75.2417 nan 12.1833 13.775 8.9625 25.7417 42.5 27.4458 136.7792 9.325 12.7375 45.5 7.575 7.5792 7.7208] Cabin [nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33' 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110' 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49' 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77' 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106' 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91' 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34' 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79' 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68' 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58' 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90' 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6' 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50' 'B42' 'C148' 'B45' 'B36' 'A21' 'D34' 'A9' 'C31' 'B61' 'C53' 'D43' 'C130' 'C132' 'C55 C57' 'C116' 'F' 'A29' 'C6' 'C28' 'C51' 'C97' 'D22' 'B10' 'E45' 'E52' 'A11' 'B11' 'C80' 'C89' 'F E46' 'B26' 'F E57' 'A18' 'E60' 'E39 E41' 'B52 B54 B56' 'C39' 'B24' 'D40' 'D38' 'C105'] Embarked ['S' 'C' 'Q' nan]
## There do not seem to be any typos in the values.
def clean(df):
    """Return a copy of *df* with the missing values imputed.

    ``Age``, ``Embarked`` and ``Fare`` are filled from neighbouring rows:
    forward-fill first, then back-fill so that a gap in the very first
    row(s) (which forward-fill cannot reach) is imputed as well.
    Missing ``Cabin`` entries are replaced by the literal string ``'None'``.
    No columns are dropped here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``Age``, ``Embarked``,
        ``Fare`` and ``Cabin``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as *df*.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()
    # Series.ffill()/bfill() replace the deprecated fillna(method=...) form.
    for col in ("Age", "Embarked", "Fare"):
        df[col] = df[col].ffill().bfill()
    # An absent cabin is kept as its own explicit value.
    df["Cabin"] = df["Cabin"].fillna("None")
    return df
# Impute the combined frame, then split it back into the train and test
# parts using the PassengerId indices of the originally loaded frames.
df = clean(df)
df_train = df.loc[df_train.index, :]
df_test = df.loc[df_test.index, :]
# Correlate the numeric columns only: the frame still holds text columns
# (Name, Sex, Ticket, ...), on which DataFrame.corr() raises in pandas 2.x.
px.imshow(
    df_train.select_dtypes(include="number").corr(),
    title="Correlation Plot of the Titanic Training Data",
)
df_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 891 entries, 1 to 891 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null float64 1 Pclass 891 non-null int64 2 Name 891 non-null object 3 Sex 891 non-null object 4 Age 891 non-null float64 5 SibSp 891 non-null int64 6 Parch 891 non-null int64 7 Ticket 891 non-null object 8 Fare 891 non-null float64 9 Cabin 891 non-null object 10 Embarked 891 non-null object dtypes: float64(3), int64(3), object(5) memory usage: 83.5+ KB
# Draw one grouped histogram of Survived per training column, with the bars
# coloured by that column's values.
hist_opts = {"x": "Survived", "barmode": "group"}
for col in df_train.columns:
    fig = px.histogram(
        df_train,
        color=col,
        title=f"Distribution of survivals vs {col}",
        **hist_opts,
    )
    fig.show()
df_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 891 entries, 1 to 891 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null float64 1 Pclass 891 non-null int64 2 Name 891 non-null object 3 Sex 891 non-null object 4 Age 891 non-null float64 5 SibSp 891 non-null int64 6 Parch 891 non-null int64 7 Ticket 891 non-null object 8 Fare 891 non-null float64 9 Cabin 891 non-null object 10 Embarked 891 non-null object dtypes: float64(3), int64(3), object(5) memory usage: 83.5+ KB
# Nominal features are stored as pandas categoricals.
features_nom = ["Survived", "Sex", "Embarked", "Pclass", "Cabin"]
for name in features_nom:
    df[name] = df[name].astype("category")

# Names look like "Braund, Mr. Owen Harris": keep what follows the first
# comma, then what precedes the first period of that remainder.
after_comma = df.Name.str.split(pat=",", n=1, expand=True)[1]
names_splits = after_comma.str.split(pat=".", n=1, expand=True)
# The leading space is preserved in the stored value, e.g. ' Mr'.
df['Jobs'] = names_splits.iloc[:, 0]
df['Jobs'].unique()
array([' Mr', ' Mrs', ' Miss', ' Master', ' Don', ' Rev', ' Dr', ' Mme',
' Ms', ' Major', ' Lady', ' Sir', ' Mlle', ' Col', ' Capt',
' the Countess', ' Jonkheer', ' Dona'], dtype=object)
# Disabled experiment: collapse the raw titles in 'Jobs' into three groups
# ('Army', 'Royal', 'Normal') stored in a new 'Titles' column.
# df['Titles'] = df['Jobs'].apply(lambda x:'Army' if x in [' Rev' , ' Major' , ' Col' ,' Capt' ] else x)
# df['Titles'] = df['Titles'].apply(lambda x:'Royal' if x in [' Don' , ' Lady' , ' Sir' ,' the Countess' , ' Jonkheer', ' Dona' ] else x)
# df['Titles'] = df['Titles'].apply(lambda x:'Normal' if x in [' Mr' , ' Mrs',' Miss',' Master',' Dr',' Mme',' Ms',' Mlle','' ] else x)
# df['Titles'].unique()
# Re-inspect the distinct raw titles kept in 'Jobs'.
df['Jobs'].unique()
array([' Mr', ' Mrs', ' Miss', ' Master', ' Don', ' Rev', ' Dr', ' Mme',
' Ms', ' Major', ' Lady', ' Sir', ' Mlle', ' Col', ' Capt',
' the Countess', ' Jonkheer', ' Dona'], dtype=object)
# fig=px.histogram(df,
# x="Survived",
# color='Titles',
# title="Distribution of survivals vs Titles",
# barmode="group")
# fig.show()
# Bucket Age into five labelled groups and plot survival per group.
age_edges = [0, 6, 18, 35, 65, 80]
age_names = ["childs", "Kids", "adults", "MiddleAged", "Old"]
age_Group = pd.cut(df["Age"], bins=age_edges, labels=age_names)
df["Age_Bins"] = age_Group
fig = px.histogram(
    df,
    x="Survived",
    color='Age_Bins',
    title="Distribution of survivals vs AgeBins",
    barmode="group",
)
fig.show()
# Family size on board: siblings/spouses plus parents/children.
df["FamTotal"] = df["SibSp"] + df["Parch"]
fig = px.histogram(
    df,
    x="Survived",
    color='FamTotal',
    title="Distribution of survivals vs FamTotal",
    barmode="group",
)
fig.show()

# Bucket the family size into four labelled groups; pd.cut intervals are
# right-closed by default, i.e. (-1, 1], (1, 4], (4, 7], (7, 10].
# NOTE(review): "NoFam" therefore also covers FamTotal == 1 - confirm that
# this is intended.
family_edges = [-1, 1, 4, 7, 10]
family_names = ["NoFam", "few", "mediamfam", 'Bigfam']
FamilyBins = pd.cut(df["FamTotal"], bins=family_edges, labels=family_names)
df["FamilyBins"] = FamilyBins
fig = px.histogram(
    df,
    x="Survived",
    color='FamilyBins',
    title="Distribution of survivals vs FamilyBins",
    barmode="group",
)
fig.show()
# Flag tickets that contain at least one letter (1) versus purely
# numeric/punctuation tickets (0).
# The previous test `x.islower() or x.isupper()` is False for mixed-case
# tickets such as 'SC/Paris 2123' or 'Fa 265302', so those were wrongly
# flagged 0 even though they carry a letter prefix.
df['AlphaTicket'] = df["Ticket"].apply(
    lambda x: 1 if any(ch.isalpha() for ch in x) else 0
)
fig = px.histogram(
    df,
    x="Survived",
    color='AlphaTicket',
    title="Distribution of survivals vs AlphaTicket",
    barmode="group",
)
fig.show()
# Histogram with a KDE overlay of the Fare column of the combined frame.
plt.figure(figsize=(15, 10))
plt.title("Distribution of Fare Data")
sns.histplot(df['Fare'], kde=True)
plt.tight_layout()
# plt.plot() with no data draws nothing and just returns an empty list;
# plt.show() is the call that actually renders the figure.
plt.show()
[]
# Bucket Fare into four labelled price groups and plot survival per group.
fare_edges = [-1, 50, 100, 265, 1000]
fare_names = ["poor", "normal", "rich", 'veryrich']
FareBins = pd.cut(df["Fare"], bins=fare_edges, labels=fare_names)
df["FareBins"] = FareBins
fig = px.histogram(
    df,
    x="Survived",
    color='FareBins',
    title="Distribution of survivals vs FareBins",
    barmode="group",
)
fig.show()
# Cabin letter: take the part of Cabin before its first digit, then keep only
# the first character of that prefix ('' if the value starts with a digit).
# The 'None' placeholder written by clean() therefore becomes 'N'.
cabin_prefix = df.Cabin.str.split(pat='0|1|2|3|4|5|6|7|8|9', n=1, expand=True)[0]
df['CabinLet'] = cabin_prefix.apply(lambda prefix: prefix[:1])
fig = px.histogram(
    df,
    x="Survived",
    color='CabinLet',
    title="Distribution of survivals vs CabinLet",
    barmode="group",
)
fig.show()
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1309 entries, 1 to 1309 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null category 1 Pclass 1309 non-null category 2 Name 1309 non-null object 3 Sex 1309 non-null category 4 Age 1309 non-null float64 5 SibSp 1309 non-null int64 6 Parch 1309 non-null int64 7 Ticket 1309 non-null object 8 Fare 1309 non-null float64 9 Cabin 1309 non-null category 10 Embarked 1309 non-null category 11 Jobs 1309 non-null object 12 Age_Bins 1309 non-null category 13 FamTotal 1309 non-null int64 14 FamilyBins 1309 non-null category 15 AlphaTicket 1309 non-null int64 16 FareBins 1309 non-null category 17 CabinLet 1309 non-null object dtypes: category(8), float64(2), int64(4), object(4) memory usage: 162.9+ KB
# The raw text columns have been mined for features above; remove them.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'])
#df= df.drop('Jobs', axis=1)
# Store the newly engineered nominal features as categoricals as well.
features_nom = ["Jobs", "AlphaTicket", "CabinLet"]
for name in features_nom:
    df[name] = df[name].astype("category")
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1309 entries, 1 to 1309 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null category 1 Pclass 1309 non-null category 2 Sex 1309 non-null category 3 Age 1309 non-null float64 4 SibSp 1309 non-null int64 5 Parch 1309 non-null int64 6 Fare 1309 non-null float64 7 Embarked 1309 non-null category 8 Jobs 1309 non-null category 9 Age_Bins 1309 non-null category 10 FamTotal 1309 non-null int64 11 FamilyBins 1309 non-null category 12 AlphaTicket 1309 non-null category 13 FareBins 1309 non-null category 14 CabinLet 1309 non-null category dtypes: category(10), float64(2), int64(3) memory usage: 108.7 KB
# Imported next to first use, like the other scikit-learn imports in this file.
from sklearn.feature_selection import mutual_info_classif

# Separate the target from the training rows, one-hot encode the combined
# feature frame, then split it back into train/test by PassengerId.
Y = df_train.pop("Survived")
df = df.drop('Survived', axis=1)
df = pd.get_dummies(df)
df_train = df.loc[df_train.index, :]
df_test = df.loc[df_test.index, :]
# Survived is a binary class label, so use the classification estimator;
# mutual_info_regression would treat the 0/1 target as a continuous value.
mi_scores = mutual_info_classif(df_train, Y.astype(int), random_state=0)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=df_train.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores
Sex_male 0.222795 Jobs_ Mr 0.207942 Sex_female 0.135846 Fare 0.127942 Jobs_ Mrs 0.071473 Pclass_3 0.054797 Pclass_2 0.047015 FamilyBins_NoFam 0.042736 Pclass_1 0.040613 CabinLet_N 0.039263 FareBins_poor 0.035934 CabinLet_E 0.033449 SibSp 0.033286 Embarked_C 0.032546 CabinLet_B 0.027854 FareBins_normal 0.022931 Age 0.022358 Age_Bins_Kids 0.020760 FareBins_veryrich 0.020495 FamilyBins_few 0.019499 FamTotal 0.017400 Jobs_ Don 0.017348 FamilyBins_Bigfam 0.015997 CabinLet_C 0.014716 AlphaTicket_0 0.010251 Jobs_ Rev 0.009886 Jobs_ Miss 0.008905 CabinLet_G 0.005904 CabinLet_F 0.004515 Age_Bins_MiddleAged 0.004006 FamilyBins_mediamfam 0.003912 Embarked_Q 0.001166 Jobs_ Capt 0.001141 AlphaTicket_1 0.000000 FareBins_rich 0.000000 CabinLet_A 0.000000 CabinLet_D 0.000000 Jobs_ Ms 0.000000 Age_Bins_Old 0.000000 Jobs_ Lady 0.000000 Parch 0.000000 Embarked_S 0.000000 Jobs_ Col 0.000000 Jobs_ Dona 0.000000 Jobs_ Dr 0.000000 Jobs_ Jonkheer 0.000000 Jobs_ Major 0.000000 Age_Bins_adults 0.000000 Jobs_ Master 0.000000 Jobs_ Mlle 0.000000 Jobs_ Mme 0.000000 Jobs_ Sir 0.000000 Jobs_ the Countess 0.000000 Age_Bins_childs 0.000000 CabinLet_T 0.000000 Name: MI Scores, dtype: float64
# from sklearn.model_selection import train_test_split
# X_train, X_valid, y_train, y_valid = train_test_split(
# df_train, Y, test_size=0.20, random_state=42)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Settings shared by every forest built below; only n_estimators varies.
forest_params = dict(
    max_depth=6,
    random_state=0,
    min_samples_split=3,
    min_samples_leaf=1,
)

# Sweep the number of trees and plot the mean 5-fold CV score for each size.
for i in range(80, 400, 10):
    clf = RandomForestClassifier(n_estimators=i, **forest_params)
    scores = cross_val_score(clf, df_train, Y, cv=5)
    plt.scatter(i, scores.mean())

# Fit the final model on the full training set with 380 trees.
clf = RandomForestClassifier(n_estimators=380, **forest_params)
clf.fit(df_train, Y)
#clf = RandomForestClassifier(n_estimators=140 , max_depth=6, random_state=42) best yet
RandomForestClassifier(max_depth=6, min_samples_split=3, n_estimators=380,
random_state=0)
# Predict the test rows with the classifier currently bound to `clf`, and
# store the labels as integers.
test_predictions = clf.predict(df_test)
preds = test_predictions.astype('int')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Cross-validate a logistic regression for comparison.
# NOTE: this rebinds `clf` to a new, unfitted estimator; `preds` above was
# already computed before the rebinding.
clf = LogisticRegression(max_iter=400, random_state=0)
scores = cross_val_score(clf, df_train, Y, cv=5)
scores.mean()
0.817079907099366
# Pair each test PassengerId with its predicted label and write the result
# to my_submission.csv without the DataFrame's own index column.
submission_columns = {'PassengerId': df_test.index, 'Survived': preds}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")
Your submission was successfully saved!